
# Load the site-address records (address_type == 's') for each vintage as a
# list of data.tables keyed on parcel_id.
# The list order is significant: later code indexes `cl` by position, so
# cl[[1]] is 2020, cl[[2]] is 2016 and cl[[3]] is 2012.
cl <- local({
  # Open the dataset once; each vintage below is a lazy query against it, so
  # the directory is not re-scanned for every vintage.
  cl_in <- open_dataset('data/cl_in/')

  lapply(c(2020, 2016, 2012), \(target_vintage) {
    cl_in %>%
      filter(vintage == target_vintage, address_type == 's') %>%
      transmute(
        parcel_id,
        # Drop the final two characters of uid.
        # NOTE(review): presumably a per-vintage suffix, so that the same
        # record shares a uid stem across vintages -- confirm.
        uid = str_replace(uid, '.{2}$', ''),
        last,
        first_m
      ) %>%
      # Materialise the arrow query explicitly instead of relying on
      # as.data.table()'s fallback conversion of a lazy query object.
      collect() %>%
      as.data.table(key = 'parcel_id')
  })
})

print('joining...')

# Pair each vintage with the one immediately before it in `cl`
# (2016 with 2020, then 2012 with 2016) and inner-join the pair on parcel_id.
# Each element is c(<position of x table>, <position of i table>).
# In the joined rows the unprefixed columns (uid, last, first_m) come from the
# x table and the `i.`-prefixed ones from the i table.
adjacent_vintages <- list(c(2, 1), c(3, 2))

site_matches <- rbindlist(lapply(adjacent_vintages, \(pos) {
  x_tbl <- cl[[pos[1]]]
  i_tbl <- cl[[pos[2]]]
  # nomatch = NULL: drop rows of i_tbl whose parcel_id has no match in x_tbl.
  x_tbl[i_tbl, on = 'parcel_id', nomatch = NULL]
}))

# Score how closely the names on the two sides of each joined row agree,
# using Jaro-Winkler similarity (prefix factor p = 0.1). Both columns are
# added to site_matches by reference.
site_matches[, c('last_match', 'first_match') := list(
  stringsim(last, i.last, method = 'jw', p = 0.1),
  stringsim(first_m, i.first_m, method = 'jw', p = 0.1)
)]

# Keep only the uid pairs whose last AND first names both score above 0.85.
# Rows with an NA score are dropped, since data.table treats NA in `i` as
# FALSE.
out <- site_matches[
  first_match > 0.85 & last_match > 0.85,
  list(uid, i.uid)
]

# Write the matched uid pairs as an arrow dataset, capped at 1e6 rows per file.
write_dataset(
  dataset = out,
  path = 'data/cl_site_across/',
  max_rows_per_file = 1e6
)